# OUTLINE: # 1. Data visualization# 2. Data wrangling and the Tidyverse (dplyer)# 3. Look over Assignment 1# Load packageslibrary(tidyverse) #ggplot, readr, dplyer, etc.library(haven) #opening Stata datasetslibrary(skimr) #descriptive statisticslibrary(nycflights13) #load datasets from moderndive book# These we'll need to install#install.packages("scales")library(scales) #for labeling axes with %, $, etc.#install.packages("ggridges")library(ggridges)#install.packages("moderndive")library(moderndive)#install.packages("ggrepel")library(ggrepel)#install.packages("questionr")library(questionr) #for frequency tables# Load datanes <-read_dta("nes.dta")states <-read_dta("states.dta")vdem <-read_dta("vdem.dta")# Calling up variables in datasets: $ method, "data$variable"skim(vdem$v2x_polyarchy)
Data summary
Name
vdem$v2x_polyarchy
Number of rows
179
Number of columns
1
_______________________
Column type frequency:
numeric
1
________________________
Group variables
None
Variable type: numeric
skim_variable
n_missing
complete_rate
mean
sd
p0
p25
p50
p75
p100
hist
data
0
1
0.52
0.25
0.02
0.29
0.52
0.75
0.91
▃▆▆▆▇
# For skimr, you can also use use (data, variable)skim(vdem, v2x_polyarchy)
Data summary
Name
vdem
Number of rows
179
Number of columns
4171
_______________________
Column type frequency:
numeric
1
________________________
Group variables
None
Variable type: numeric
skim_variable
n_missing
complete_rate
mean
sd
p0
p25
p50
p75
p100
hist
v2x_polyarchy
0
1
0.52
0.25
0.02
0.29
0.52
0.75
0.91
▃▆▆▆▇
# Frequency table, using questionr package: freq(vdem$v2x_regime)
###################################################################################### ################################## Histograms ################################## ###################################################################################### # We'll use the "weather" data from nycflights13 package; # Hourly meteorological data for LGA, JFK and EWR.# We want to visualize the distribution of a single variable with a histogram; # these are ideal for "continuous" variables with lots of values.weather <- weather#?weather# We'll focus on temperatures ("temp") at these three airports# Note: geometry for a histogram is "geom_histogram"ggplot(data = weather, mapping =aes(x = temp)) +geom_histogram()
# Too clumpy; let's delineate the bars some more. Always use color = "white" option.ggplot(data = weather, mapping =aes(x = temp)) +geom_histogram(color ="white")
# We can also adjust the "bins" or the width of the bars; more or less fine-grainedggplot(data = weather, mapping =aes(x = temp)) +geom_histogram(bins =40, color ="white")
# Facet wraps: These are very cool for subsetting by some groupingggplot(data = weather, mapping =aes(x = temp)) +geom_histogram(binwidth =5, color ="white") +facet_wrap(~month)
# We can also report y-axis in percentage terms; the third line uses # the "scales" package# also, label axes, give titleggplot(data = weather, mapping =aes(x = temp)) +geom_histogram(aes(y = (..count..)/sum(..count..)), binwidth =2.029083, color ="white") +scale_y_continuous(labels=percent) +labs(x="Temperature", y="Percentage of Hours", title="Temperatures")
# Now change labeling for y-axis valuesggplot(data = weather, mapping =aes(x = temp)) +geom_histogram(aes(y = (..count..)/sum(..count..)), binwidth =2.029083, color ="white") +scale_y_continuous(limits =c(0, .06), breaks =seq(0, .06, by = .01), labels=percent) +labs(x="Temperature", y="Percentage of Hours", title="Temperatures")
# Change theme for graph background# Now change labeling for y-axis valuesggplot(data = weather, mapping =aes(x = temp)) +geom_histogram(aes(y = (..count..)/sum(..count..)), binwidth =2.029083, color ="white") +scale_y_continuous(limits =c(0, .06), breaks =seq(0, .06, by = .01), labels=percent) +labs(x="Temperature", y="Percentage of Hours", title="Temperatures") +theme_minimal()
# Center titleggplot(data = weather, mapping =aes(x = temp)) +geom_histogram(aes(y = (..count..)/sum(..count..)), binwidth =2.029083, color ="white") +scale_y_continuous(limits =c(0, .06), breaks =seq(0, .06, by = .01), labels=percent) +labs(x="Temperature", y="Percentage of Hours", title="Temperatures") +theme_minimal() +theme(plot.title =element_text(hjust =0.5))
########### NOTE ON SYNTAX ################ There are some aspects of the syntax used above that are unnecessary and that we can# eliminate. For example: # This command: ggplot(data = weather, mapping =aes(x = temp)) +geom_histogram()
# is identical to this commmand (note that I've removed "data=" and "mapping="): ggplot(weather, aes(x = temp)) +geom_histogram()
# You actually don't even need "x=" but I like to include that to make it clear.ggplot(weather, aes(temp)) +geom_histogram()
# From now on, we'll simplify to the second command###################################################################################### ################################## Density Plots ################################## ###################################################################################### ggplot(weather, aes(x = temp)) +geom_density() +labs(x="Temperature", y="Density", title="Temperatures") +theme(plot.title =element_text(hjust =0.5))
# Fill with colorggplot(weather, aes(x = temp)) +geom_density(fill="dodgerblue") +labs(x="Temperature", y="Density", title="Temperatures") +theme(plot.title =element_text(hjust =0.5))
# Make transparent with "alpha"ggplot(weather, aes(x = temp)) +geom_density(fill="dodgerblue", alpha=.5) +labs(x="Temperature", y="Density", title="Temperatures") +theme(plot.title =element_text(hjust =0.5))
# Ridge plots# Notice I have to change month to "factor" variable. Most variables are "continuous"# by default. I need to change this to "discrete" or ggplot can't graph it like # I want it to.ggplot(weather, aes(x = temp, y=factor(month))) +geom_density_ridges(fill="dodgerblue", alpha=.5) +labs(x="Temperature", y="Month", title="Temperatures") +theme_minimal() +theme(plot.title =element_text(hjust =0.5))
###################################################################################### ################################## Boxplots ################################## ###################################################################################### # Boxplots are another way to visualize the distribution of a single variable. These are # again ideal for "continuous" variables with lots of values.# What do boxplots give us exactly? "Low", 25th pctile, 50th pctile (median), 75th pctile, # and "high"; and outliers.# Let's again use the weather data: temperature # Geometry for boxplot is "geom_boxplot"ggplot(weather, aes(y = temp)) +geom_boxplot()
# Get descriptive statistics using skimr packageskim(weather, temp)
Data summary
Name
weather
Number of rows
26115
Number of columns
15
_______________________
Column type frequency:
numeric
1
________________________
Group variables
None
Variable type: numeric
skim_variable
n_missing
complete_rate
mean
sd
p0
p25
p50
p75
p100
hist
temp
1
1
55.26
17.79
10.94
39.92
55.4
69.98
100.04
▂▇▇▇▁
# Use more fine-grained y-axis labelsggplot(weather, aes(y = temp)) +geom_boxplot() +scale_y_continuous(limits =c(0, 100), breaks =seq(0, 100, by =5))
# Now let's look at temp by month.# We need to treat month as a "factor" or categorical variable instead of a continuous # variable. We'll talk more about this distinction later in the class.ggplot(weather, aes(x =factor(month), y = temp)) +geom_boxplot() +labs(x="Month", y="Temperature")
# Interpretation: "The 'whiskers' are set to extend out no more than 1.5 × IQR # (75th minus 25th) units away from either end of the boxes. We say 'no more than' # because the ends of the whiskers have to correspond to observed temperatures."# Note that the dots outside the whiskers are "outliers." ###################################################################################### ################################## Barplots ################################## ###################################################################################### # Barplots are ideal for visualizing the spread of a "categorical" # or "discrete" variable with few values, i.e., ordinal or nominal variable.# This is related to a frequency distribution.# Let's use the flights data from nycflights13 to examine the frequency of flights by # carrier. flights <- flights# Let's use the "carrier" variable.#?flights# We could get a frequency distribution in table form using the # "freq" command from the questionr package.freq(flights$carrier)
n % val%
9E 18460 5.5 5.5
AA 32729 9.7 9.7
AS 714 0.2 0.2
B6 54635 16.2 16.2
DL 48110 14.3 14.3
EV 54173 16.1 16.1
F9 685 0.2 0.2
FL 3260 1.0 1.0
HA 342 0.1 0.1
MQ 26397 7.8 7.8
OO 32 0.0 0.0
UA 58665 17.4 17.4
US 20536 6.1 6.1
VX 5162 1.5 1.5
WN 12275 3.6 3.6
YV 601 0.2 0.2
# Now ggplot; geometry for bar graph is "geom_bar"ggplot(flights, aes(x = carrier)) +geom_bar()
# Stacked bar graphs; let's say you wanted to break down by origin (EWR, JFK, or LGA)ggplot(flights, aes(x = carrier, fill = origin)) +geom_bar()
# Grouped bar graphs; let's say you wanted to break down by origin (EWR, JFK, or LGA)ggplot(flights, aes(x = carrier, fill = origin)) +geom_bar(position ="dodge")
# We could also facet wrapggplot(flights, aes(x = carrier)) +geom_bar() +facet_wrap(~ origin, ncol =1)
# Note how y-axis is a count by default. We could report as a proportion instead.ggplot(flights, aes(x = carrier)) +geom_bar(aes(y = (..count..)/sum(..count..)))
# Percentage much more intuitive; the third line uses the package "scales", which # converts the proportion to a percentage and adds a percent sign.ggplot(flights, aes(x = carrier)) +geom_bar(aes(y = (..count..)/sum(..count..))) +scale_y_continuous(labels=percent)
# Report y-axis labels as percentage; the third line uses the package "scales"; label axesggplot(flights, aes(x = carrier)) +geom_bar(aes(y = (..count..)/sum(..count..))) +scale_y_continuous(labels=percent) +labs(x="Carrier", y="Percentage")
###################################################################################### ################################## Scatterplots ################################## ###################################################################################### # Alaska data (from moderndive package); let's view it firstalaska_flights <- alaska_flights# Variable descriptions#?alaska_flights# Basic scatterplot; note how the plus sign separates different arguments.# Note how the three elements of "grammar of graphics" are implemented with ggplot.# The geometry name for a scatterplot is "geom_point"ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) +geom_point()
# What sort of relationship do we visualize? How do we *communicate* this? # We can add different features, which we'll do throughout the semester. # Let's add more informative labels for y and x axisggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) +geom_point() +labs(x="Departure Delay", y="Arrival Delay")
# Let's say we wanted a white background in the plotggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) +geom_point() +labs(x="Departure Delay", y="Arrival Delay") +theme_minimal()
# Transparent dots and white backgroundggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) +geom_point(alpha = .2) +labs(x="Departure Delay", y="Arrival Delay") +theme_minimal()
# Make dots smallerggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) +geom_point(size=.5, alpha=.2) +labs(x="Departure Delay", y="Arrival Delay") +theme_minimal()
# "Jitter" the points -- good option if there are many overlapsggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) +geom_jitter(width =30, height =30) +labs(x="Departure Delay", y="Arrival Delay") +theme_minimal()
# Color code by monthggplot(alaska_flights, aes(x = dep_delay, y = arr_delay, color=factor(month))) +geom_jitter(width =30, height =30) +labs(x="Departure Delay", y="Arrival Delay") +theme_minimal()
# Let's add a "line of best fit" to the plotggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) +geom_point() +geom_smooth(method="lm") +labs(x="Departure Delay", y="Arrival Delay") +theme_minimal()
# Take out the confidence interval (shading around line)ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) +geom_point() +geom_smooth(method="lm", se=FALSE) +labs(x="Departure Delay", y="Arrival Delay") +theme_minimal()
# Set limits and increments for y-axis labelsggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) +geom_point() +geom_smooth(method="lm", se=FALSE) +labs(x="Departure Delay", y="Arrival Delay") +scale_y_continuous(limits =c(-100, 200), breaks =seq(-100, 200, by =50)) +theme_minimal()
###################################################################################### ################################## Linegraphs ################################## ###################################################################################### # We'll use the "early_january_weather" data from moderndive packageearly_january_weather <- early_january_weather#?early_january_weather# Let's plot temperatures ("temp") in Jan. against time ("time_hour")# The geometry for a linegraph is "geom_line"ggplot(early_january_weather, aes(x = time_hour, y = temp)) +geom_line()
# One other cool graph is a "line smoother" to visualize the trend in a more effective wayggplot(early_january_weather, aes(x = time_hour, y = temp)) +geom_line() +geom_smooth(se=FALSE)
# We can change the limits on each axis; let's change the y axisggplot(early_january_weather, aes(x = time_hour, y = temp)) +geom_line() +geom_smooth(se=FALSE) +ylim(0, 75) +labs(x="Time/hour", y="Temperature")
# We can also set the limits and the increments we want on y-axis labelsggplot(early_january_weather, aes(x = time_hour, y = temp)) +geom_line() +geom_smooth(se=FALSE) +labs(x="Time/hour", y="Temperature") +scale_y_continuous(limits =c(0, 75), breaks =seq(0, 75, by =10))
############################## DATA WRANGLING (Ch. 3) ############################### All data wrangling tools in Ch. 3 are from the package "dplyer," which is loaded# when you load the "tidyverse" package. ############################# 1. The Pipe Operator, |> ############################## Allows you to link different aspects of code. "First, I'll do this, # THEN I'll do that, # THEN I'll do this." # The pipe operator is the "THEN", or the connector.# Background on pipes. There are two pipes that are the same: # 1. %>% from tidyverse# 2. |> from base R# We will use the second one. But when you see reference to the first, realize# they are the same.# Keyboard shortcut: shift-command-m for Macs, shift-ctrl-m for PCs. |> # Change to "native pipe," or |> # Click on Edit -> Preferences; click "Code" on sidebar; # check "Use native pipe operator, |> # Example# This command:ggplot(flights, aes(x = carrier)) +geom_bar()
# is identical to this command:flights |>ggplot(aes(x = carrier)) +geom_bar()
# Note the difference. "Use this data, flights, THEN produce a bar graph.########################## 2. Filter: Select subset of observations ####################### Let's use the "states" data that you'll use in Assignment 1# Note that you can list all the variables in the dataset by running:ls(states)
# Let's say we wanted to create two separate datasets: One for Southern states and# one for non-Southern states. We can use the "filter" command# First, let's see how "south" is coded; note 1=South, 0=non-Southfreq(states$south)
n % val%
[0] Nonsouth 34 68 68
[1] South 16 32 32
# Create new data object for Southern states; note how we use the pipe operatorsouth <- states |>filter(south==1)# Non-Southnonsouth <- states |>filter(south==0)# Now we can describe voter turnout for Southern and non-Southern statesnonsouth |>ggplot(aes(y=vep16_turnout)) +geom_boxplot() +ylim(40,80)
south |>ggplot(aes(y=vep16_turnout)) +geom_boxplot() +ylim(40,80)
# Or, use original states data; check out labelsstates |>ggplot(aes(x=factor(south), y=vep16_turnout)) +geom_boxplot() +ylim(40,80) +labs(x="non-South v. South", y="Voter Turnout 2016") +scale_x_discrete(labels=c("non-South", "South"))
# One huge value of the pipe is that we actually would not need to create the two # new data objects for South and non-South. We can integrate "subsetting" with the # the graphing or data command we want to execute.# This: south |>ggplot(aes(y=vep16_turnout)) +geom_boxplot() +ylim(40,80)
# generates the exact same graph as this: states |>filter(south==1) |>ggplot(aes(y=vep16_turnout)) +geom_boxplot() +ylim(40,80)
########################## 2. Summarize ####################### We can use this command to generate summary statistics, like mean, median, standard# deviation, etc. states |>summarize(mean =mean(vep16_turnout), std_dev =sd(vep16_turnout))
# A tibble: 1 × 2
mean std_dev
<dbl> <dbl>
1 61.7 6.38
# We could also save this information as an object, called a "tibble." stats <- states |>summarize(mean =mean(vep16_turnout), std_dev =sd(vep16_turnout))# Type "stats" to see outputstats
# A tibble: 1 × 2
mean std_dev
<dbl> <dbl>
1 61.7 6.38
# We can also combine filter and summarize; let's use South and non-South examplestates |>filter(south==1) |>summarize(mean =mean(vep16_turnout), std_dev =sd(vep16_turnout))
# A tibble: 1 × 2
mean std_dev
<dbl> <dbl>
1 58.7 5.90
states |>filter(south==0) |>summarize(mean =mean(vep16_turnout), std_dev =sd(vep16_turnout))
# A tibble: 1 × 2
mean std_dev
<dbl> <dbl>
1 63.0 6.20
# Use skimr package to get mean, sd, and more.states |>filter(south==1) |>skim(vep16_turnout)
Data summary
Name
filter(states, south == 1…
Number of rows
16
Number of columns
200
_______________________
Column type frequency:
numeric
1
________________________
Group variables
None
Variable type: numeric
skim_variable
n_missing
complete_rate
mean
sd
p0
p25
p50
p75
p100
hist
vep16_turnout
0
1
58.72
5.9
50.1
52.92
59.5
64.75
67.2
▇▂▆▂▇
states |>filter(south==0) |>skim(vep16_turnout)
Data summary
Name
filter(states, south == 0…
Number of rows
34
Number of columns
200
_______________________
Column type frequency:
numeric
1
________________________
Group variables
None
Variable type: numeric
skim_variable
n_missing
complete_rate
mean
sd
p0
p25
p50
p75
p100
hist
vep16_turnout
0
1
63.04
6.2
43
59.7
63.5
65.7
74.8
▁▁▇▇▃
########################## 3. Group by ####################### This groups observations by variable values and executes commands for each group# Let's get avg. turnout for each of four regions in the U.S., using the "reg" variable# 1=NE, 2=MW, 3=South, 4=Westfreq(states$reg)
n % val%
[1] Northeast 9 18 18
[2] Midwest 12 24 24
[3] South 16 32 32
[4] West 13 26 26
region <- states |>group_by(reg) |>summarize(mean =mean(vep16_turnout), std_dev =sd(vep16_turnout))# See outputregion
# No need to save as region, though. You can go directly to this: states |>group_by(reg) |>summarize(mean =mean(vep16_turnout), std_dev =sd(vep16_turnout))
# Scatterplot with labels AND ggrepelstates |>ggplot(aes(x=union_2016, y=vep16_turnout)) +geom_point() +labs(x="Union", y="Percent Turnout") +geom_smooth(method="lm", se=FALSE) +geom_text_repel(aes(label=StateID))
# Scatterplot with labels AND ggrepel; make labels smallerstates |>ggplot(aes(x=union_2016, y=vep16_turnout)) +geom_point() +labs(x="Union", y="Percent Turnout") +geom_smooth(method="lm", se=FALSE) +geom_text_repel(aes(label=StateID), size=3)
# Scatterplot with labels AND ggrepel; make labels smaller; add % to axis labelsstates |>ggplot(aes(x=union_2016, y=vep16_turnout)) +geom_point() +labs(x="Union", y="Percent Turnout") +geom_smooth(method="lm", se=FALSE) +geom_text_repel(aes(label=StateID), size=3) +scale_y_continuous(label=percent) +scale_x_continuous(label=percent)
# Scatterplot with labels AND ggrepel; make labels smaller; add % to axis labels# Note: label=percent assumes we have proportions. So we can change transform our # variables right in the aes(x=....)states |>ggplot(aes(x=union_2016/100, y=vep16_turnout/100)) +geom_point() +labs(x="Union", y="Percent Turnout") +geom_smooth(method="lm", se=FALSE) +geom_text_repel(aes(label=StateID), size=3) +scale_y_continuous(label=percent) +scale_x_continuous(label=percent)
########################## 4. Create new variables using "mutate" ############ Let's say we wanted to change variable name for turnout and save in "st" object# NEVER override existing data. Instead, create new variable.states <- states |>mutate(turnout=vep16_turnout)# Let's say we need to recode a variable. Again, don't override existing var. # Create new one# NES example from Assignment 1freq(nes$V201228)
nes |>ggplot(aes(x=factor(pid3))) +geom_bar() +scale_x_discrete(labels=c("Dem", "Ind", "Rep"))
nes |>filter(!is.na(pid3)) |>ggplot(aes(x=factor(pid3))) +geom_bar() +scale_x_discrete(labels=c("Dem", "Ind", "Rep"))
nes |>filter(!is.na(pid3)) |>ggplot(aes(x=factor(pid3))) +geom_bar(width=.5) +scale_x_discrete(labels=c("Dem", "Ind", "Rep"))
# Report pct and add appropriate labels########################## 5. Select: subset variables ####################### Let's say we're working with the states data, but we want to subset JUST the variables# that we're analyzing. From Assignment 1, we use "vep16_turnout" and "ba_or_more_2015." # Let's use "state" and "StateID" as well.st_reduced <- states |>select(state, StateID, vep16_turnout, ba_or_more_2015)# You can also rename variables, create more intuitive var namesst_reduced <- states |>select(state, StateID, vep16_turnout, ba_or_more_2015) |>mutate(turnout=vep16_turnout, ba=ba_or_more_2015)